NOTE: This analysis requires at least 10Gb of RAM to run.
input_folder <- "raw_input"
output_folder <- "results"
output_format <- ".png"
The data set at the below URL was downloaded and uncompressed:
http://taraoceans.sb-roscoff.fr/EukDiv/data/Database_W5_OTU_occurences.tsv.zip
library(metacoder)
data <- parse_taxonomy_table(file.path(input_folder, "Database_W5_OTU_occurences.tsv"),
taxon_col = c("class" = -9), class_sep = "\\|")
The sample data was downloaded from the URL below:
http://taraoceans.sb-roscoff.fr/EukDiv/data/Database_W1_Sample_parameters.xls
sample_data <- readxl::read_excel(file.path(input_folder, "Database_W1_Sample_parameters.xls"))
## DEFINEDNAME: 00 00 00 1d 0b 00 00 00 01 00 00 00 00 00 00 45 78 63 65 6c 5f 42 75 69 6c 74 49 6e 5f 5f 46 69 6c 74 65 72 44 61 74 61 62 61 73 65 3b 00 00 00 00 4e 01 00 00 1c 00
## DEFINEDNAME: 00 00 00 1d 0b 00 00 00 01 00 00 00 00 00 00 45 78 63 65 6c 5f 42 75 69 6c 74 49 6e 5f 5f 46 69 6c 74 65 72 44 61 74 61 62 61 73 65 3b 00 00 00 00 4e 01 00 00 1c 00
## DEFINEDNAME: 00 00 00 1d 0b 00 00 00 01 00 00 00 00 00 00 45 78 63 65 6c 5f 42 75 69 6c 74 49 6e 5f 5f 46 69 6c 74 65 72 44 61 74 61 62 61 73 65 3b 00 00 00 00 4e 01 00 00 1c 00
## DEFINEDNAME: 00 00 00 1d 0b 00 00 00 01 00 00 00 00 00 00 45 78 63 65 6c 5f 42 75 69 6c 74 49 6e 5f 5f 46 69 6c 74 65 72 44 61 74 61 62 61 73 65 3b 00 00 00 00 4e 01 00 00 1c 00
sample_columns <- sample_data[["PANGAEA ACCESSION NUMBER"]]
data <- mutate_taxa(data,
read_counts = vapply(obs(data),
function(x) sum(data$obs_data$totab[x]), numeric(1)))
seed = 9 #9, 10, 12 is good
set.seed(seed)
taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
background_color <- "#ccfff7"
data %>%
filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>%
filter_taxa(read_counts >= 100) %>%
filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>%
heat_tree(title = "Plankton diversity in the sunlit ocean",
title_size = 0.03,
node_color_axis_label = "Number of reads (Abundance)",
node_size_axis_label = "Number of species (OTUs)",
node_size = n_obs,
node_size_range = c(0.0012, NA),
node_color = read_counts,
node_color_range = c("grey", "#80cdc1", "#018571", "#dfc27d", "#a6611a"),
node_color_trans = "log10",
node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) &
! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)) &
read_counts > 10000,
name, NA),
node_label_color = "#000000",
node_label_color_trans = "area",
node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5,
node_label_size_trans = "area",
node_label_size_range = c(0.001, NA),
node_label_max = 1000,
tree_label = name,
tree_label_color = "#00806c",
tree_label_max = 100,
initial_layout = "re", layout = "da",
overlap_avoidance = .65,
background_color = background_color,
maxiter = 50, fineiter = 50,
margin_size = c(0.001, 0.001),
output_file = file.path(output_folder, paste0("sup_figure_1--tara_all_plankton_diversity", output_format)))
For each taxon we will identify the proportion of OTUs with less than a 90% match to the most similar reference sequence to approximate classification certainty.
data <- mutate_obs(data, pid = as.numeric(pid))
data <- mutate_taxa(data, mean_pid = vapply(obs(data),
function(x) mean(data$obs_data$pid[x], na.rm = TRUE), numeric(1)))
# Percentage of OTUs with less than 90% idententiy
data <- mutate_taxa(data, percent_known = vapply(obs(data),
function(x) sum(data$obs_data$pid[x] >= 90, na.rm = TRUE) / length(x) * 100, numeric(1)))
seed = 1
set.seed(seed)
taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
data %>%
filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>%
filter_taxa(read_counts >= 100) %>%
filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>%
heat_tree(title = "Poportion of OTUs not well identified",
title_size = 0.03,
node_color_axis_label = "Percent of species (OTUs) identified",
node_size_axis_label = "Number of species (OTUs)",
node_size = n_obs,
node_size_range = c(0.0012, NA),
node_color = percent_known,
node_color_range = c("red", "orange", "yellow", "green", "cyan"),
node_color_trans = "linear",
node_color_interval = c(0, 100),
node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) &
! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)),
name, NA),
node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5,
node_label_size_trans = "area",
node_label_size_range = c(0.001, NA),
node_label_max = 1000,
tree_label = name,
tree_label_max = 100,
initial_layout = "re", layout = "da",
overlap_avoidance = .65,
maxiter = 50, fineiter = 50,
margin_size = c(0.001, 0.001),
output_file = file.path(output_folder, paste0("sup_figure_2--tara_proportion_identified", output_format)))
set.seed(64)
taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
taxa_patterns_to_remove <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
data %>%
filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>%
filter_taxa(read_counts >= 200) %>%
filter_taxa(name == "Metazoa", subtaxa = TRUE) %>%
heat_tree(node_color_axis_label = "Percent of OTUs identified",
edge_color_axis_label = "Percent of OTUs identified",
edge_size = read_counts,
edge_size_range = c(0.001, 0.01),
edge_color = percent_known,
edge_color_range = c("red", "orange", "yellow", "greenyellow", "green"),
edge_color_trans = "linear",
node_size_axis_label = "Number of OTUs",
edge_size_axis_label = "Number of reads",
node_size = n_obs,
node_size_range = c(0.005, 0.03),
node_color = percent_known,
node_color_range = c("red", "orange", "yellow", "greenyellow", "green"),
node_color_trans = "linear",
node_color_interval = c(0, 100),
node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) &
! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)),
name, NA),
node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5,
node_label_size_range = c(0.008, 0.012),
node_label_size_trans = "area",
node_label_max = 1000,
initial_layout = "fr", layout = "da",
overlap_avoidance = .8,
maxiter = 50, fineiter = 50,
weight.node.edge.dist = 15,
output_file = file.path(output_folder,
paste0("figure_1--tara_metazoa--seed_",
seed, output_format)))
Comments